import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV, ParameterGrid, \
RandomizedSearchCV, RepeatedKFold
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor,BaggingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence, plot_histogram, plot_objective
from IPython import display
import itertools as it
#Libraries for visualizing trees
from sklearn.tree import export_graphviz, export_text
from six import StringIO
from IPython.display import Image
import pydotplus
import time as time
import warnings
8 Bagging (addendum)
This notebook provides examples to:
Compare tuning bagging hyperparameters with OOB validation and \(k\)-fold cross-validation.
Compare bagging tuned models with untuned models.
#Using the same datasets as in linear regression in STAT303-2,
#so that we can compare the non-linear models with linear regression
trainf = pd.read_csv('./Datasets/Car_features_train.csv')
trainp = pd.read_csv('./Datasets/Car_prices_train.csv')
testf = pd.read_csv('./Datasets/Car_features_test.csv')
testp = pd.read_csv('./Datasets/Car_prices_test.csv')
train = pd.merge(trainf, trainp)
test = pd.merge(testf, testp)
train.head()
 | carID | brand | model | year | transmission | mileage | fuelType | tax | mpg | engineSize | price |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18473 | bmw | 6 Series | 2020 | Semi-Auto | 11 | Diesel | 145 | 53.3282 | 3.0 | 37980 |
1 | 15064 | bmw | 6 Series | 2019 | Semi-Auto | 10813 | Diesel | 145 | 53.0430 | 3.0 | 33980 |
2 | 18268 | bmw | 6 Series | 2020 | Semi-Auto | 6 | Diesel | 145 | 53.4379 | 3.0 | 36850 |
3 | 18480 | bmw | 6 Series | 2017 | Semi-Auto | 18895 | Diesel | 145 | 51.5140 | 3.0 | 25998 |
4 | 18492 | bmw | 6 Series | 2015 | Automatic | 62953 | Diesel | 160 | 51.4903 | 3.0 | 18990 |
X = train[['mileage','mpg','year','engineSize']]
Xtest = test[['mileage','mpg','year','engineSize']]
y = train['price']
ytest = test['price']
- Tree without tuning
- Tree performance improves with tuning
- Bagging tuned tree
- Bagging untuned tree - better, how?
- Tuning bagged model - OOB
- Tuning bagged model - BayesSearchCV
- warm start
- Bagging KNN - no need to tune number of neighbors
8.1 Tree without tuning
model = DecisionTreeRegressor()
cv = KFold(n_splits=5, shuffle=True, random_state=1)
-np.mean(cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv = cv))
7056.960817154941
param_grid = {'max_depth': Integer(2, 30)}
gcv = BayesSearchCV(model, search_spaces = param_grid, cv = cv, n_iter = 40, random_state = 10,
                    scoring = 'neg_root_mean_squared_error', n_jobs = -1)
paras = list(gcv.search_spaces.keys())
paras.sort()

def monitor(optim_result):
    cv_values = pd.Series(optim_result['func_vals']).cummin()
    display.clear_output(wait = True)
    min_ind = pd.Series(optim_result['func_vals']).argmin()
    print(paras, "=", optim_result['x_iters'][min_ind], pd.Series(optim_result['func_vals']).min())
    sns.lineplot(cv_values)
    plt.show()

gcv.fit(X, y, callback = monitor)
['max_depth'] = [10] 6341.1481858990355
BayesSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True), estimator=DecisionTreeRegressor(), n_iter=40, n_jobs=-1, random_state=10, scoring='neg_root_mean_squared_error', search_spaces={'max_depth': Integer(low=2, high=30, prior='uniform', transform='normalize')})
8.2 Performance of tree improves with tuning
model = DecisionTreeRegressor(max_depth=10)
cv = KFold(n_splits=5, shuffle=True, random_state=1)
-np.mean(cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv = cv))
6442.494300778735
8.3 Bagging tuned trees
model = BaggingRegressor(DecisionTreeRegressor(max_depth = 10), oob_score=True, n_estimators = 100).fit(X, y)
mean_squared_error(model.oob_prediction_, y, squared = False)
5354.357809020438
8.4 Bagging untuned trees
model = BaggingRegressor(DecisionTreeRegressor(), oob_score=True, n_estimators = 100).fit(X, y)
mean_squared_error(model.oob_prediction_, y, squared = False)
5248.720845665685
Why is bagging tuned trees worse than bagging untuned trees?
In the tuned tree here, the reduction in variance from limiting the maximum depth came at the cost of an increase in the bias of the individual trees. Bagging reduces only the variance of the individual trees, not their bias. Thus, bagging high-bias models produces a high-bias ensemble, whereas bagging high-variance models can produce a low-variance ensemble if the models are not highly correlated.
Bagging tuned models may still perform better than bagging untuned models if the reduction in variance of the individual models is large enough to outweigh the increase in their bias and in their pairwise correlation.
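To make this concrete, here is a minimal sketch (not in the original notebook) that bags a deliberately shallow, high-bias tree; its OOB RMSE should be noticeably worse than that of the two bagged models above, since bagging cannot remove the bias of the base trees.
# Assumed illustration: bag a deliberately shallow (high-bias) tree.
# Bagging reduces variance but leaves the bias of the base trees untouched,
# so the OOB RMSE is expected to be worse than for the two models above.
model_shallow = BaggingRegressor(DecisionTreeRegressor(max_depth = 3), oob_score=True, n_estimators = 100).fit(X, y)
mean_squared_error(model_shallow.oob_prediction_, y, squared = False)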
8.5 Tuning bagged model - OOB
param_grid1 = {'max_samples': [0.25, 0.5, 0.75, 1.0],
               'max_features': [2, 3, 4],
               'bootstrap_features': [True, False]}
param_grid2 = {'max_samples': [0.25, 0.5, 0.75, 1.0],
               'max_features': [1],
               'bootstrap_features': [False]}
param_list1 = list(it.product(*[values for key, values in param_grid1.items()]))
param_list2 = list(it.product(*[values for key, values in param_grid2.items()]))
param_list = param_list1 + param_list2
oob_score_pr = []
for pr in param_list:
    model = BaggingRegressor(DecisionTreeRegressor(), max_samples=pr[0], max_features=pr[1],
                             bootstrap_features=pr[2], n_jobs = -1, oob_score=True, n_estimators = 50).fit(X, y)
    oob_score_pr.append(mean_squared_error(model.oob_prediction_, y, squared=False))
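As a follow-up (a sketch, not in the original notebook), the combination with the lowest OOB RMSE can be read off from the scores computed above:
# Assumed follow-up: locate the hyperparameter combination with the lowest OOB RMSE
best_ind = int(np.argmin(oob_score_pr))
print("Best (max_samples, max_features, bootstrap_features):", param_list[best_ind])
print("OOB RMSE:", oob_score_pr[best_ind])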
What is the benefit of OOB validation to tune hyperparameters in bagging?
OOB validation is much cheaper than \(k\)-fold cross-validation: for each hyperparameter combination, only one ensemble is trained, i.e., \(1/k\) as many models as in \(k\)-fold cross-validation. On the other hand, each model in \(k\)-fold cross-validation is trained on a smaller dataset, so its individual training cost is lower. Typically, OOB validation is still faster, and the higher the value of \(k\), the larger its speed advantage over \(k\)-fold cross-validation.
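A rough timing sketch under these assumptions (not from the original notebook), using the time module imported above: OOB validation fits one ensemble for a given hyperparameter setting, while 5-fold cross-validation fits five.
# Assumed timing comparison for a single hyperparameter setting
start = time.time()
BaggingRegressor(DecisionTreeRegressor(), n_estimators = 50, oob_score=True, n_jobs = -1).fit(X, y)
print("OOB validation time:", time.time() - start)

start = time.time()
cross_val_score(BaggingRegressor(DecisionTreeRegressor(), n_estimators = 50, n_jobs = -1),
                X, y, cv = cv, scoring = 'neg_root_mean_squared_error')
print("5-fold CV time:", time.time() - start)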
8.6 Tuning bagged model - BayesSearchCV
When hyperparameters can be tuned with OOB validation, what is the benefit of using k-fold cross-validation?
- Hyperparameters cannot be tuned over continuous spaces with OOB validation.
- The OOB score is not computed if sampling is done without replacement (bootstrap = False). Thus, to tune the bootstrap hyperparameter, \(k\)-fold cross-validation must be used (see the sketch after this list).
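A minimal sketch of the second point (assumed, not in the original notebook): requesting an OOB score with bootstrap = False raises an error in scikit-learn, so such settings have to be scored with cross-validation instead.
# Assumed illustration: OOB score is unavailable when sampling without replacement
try:
    BaggingRegressor(DecisionTreeRegressor(), bootstrap=False, oob_score=True, n_estimators = 50).fit(X, y)
except ValueError as e:
    print(e)

# The same setting can still be evaluated with k-fold cross-validation
-np.mean(cross_val_score(BaggingRegressor(DecisionTreeRegressor(), bootstrap=False, n_estimators = 50),
                         X, y, cv = cv, scoring = 'neg_root_mean_squared_error'))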
def monitor(optim_result):
    cv_values = pd.Series(optim_result['func_vals']).cummin()
    display.clear_output(wait = True)
    min_ind = pd.Series(optim_result['func_vals']).argmin()
    print(paras, "=", optim_result['x_iters'][min_ind], pd.Series(optim_result['func_vals']).min())
    sns.lineplot(cv_values)
    plt.show()
param_grid = {'max_samples': Real(0.2, 1.0),
              'max_features': Integer(1, 4),
              'bootstrap_features': [True, False],
              'bootstrap': [True, False]}

gcv = BayesSearchCV(BaggingRegressor(DecisionTreeRegressor(), bootstrap=False),
                    search_spaces = param_grid, cv = cv, n_jobs = -1,
                    scoring='neg_root_mean_squared_error')

paras = list(gcv.search_spaces.keys())
paras.sort()

gcv.fit(X, y, callback=monitor)
['bootstrap', 'bootstrap_features', 'max_features', 'max_samples'] = [True, False, 4, 0.8061354588503475] 5561.064432968422
BayesSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True), estimator=BaggingRegressor(bootstrap=False, estimator=DecisionTreeRegressor()), n_jobs=-1, scoring='neg_root_mean_squared_error', search_spaces={'bootstrap': [True, False], 'bootstrap_features': [True, False], 'max_features': Integer(low=1, high=4, prior='uniform', transform='normalize'), 'max_samples': Real(low=0.2, high=1.0, prior='uniform', transform='normalize')})
plot_histogram(gcv.optimizer_results_[0], 0)
plot_objective(gcv.optimizer_results_[0])
8.7 warm start
What is the purpose of warm_start?
The purpose of warm_start is to avoid building trees from scratch: trees are added incrementally to an existing ensemble, so the validation error can be monitored as the ensemble grows. However, note that the OOB score is not computed with warm_start. Thus, a validation-set approach must be adopted to tune the number of trees.
A cheaper approach to tune the number of estimators is trial and error: stop increasing once the cross-validation error / OOB error / validation-set error stabilizes.
model = BaggingRegressor(DecisionTreeRegressor(), oob_score=False, n_estimators = 5,
                         warm_start=True).fit(X, y)
rmse = []
for i in range(10, 200, 10):
    model.n_estimators = i
    model.fit(X, y)
    rmse.append(mean_squared_error(model.predict(Xtest), ytest, squared=False))
sns.lineplot(x = range(10, i+1, 10), y = rmse)
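One way (a sketch under the "stop once the error stabilizes" rule above, not in the original notebook) to pick the number of trees from the rmse values just computed:
# Assumed stopping rule: stop once the test RMSE changes by less than 1%
# between consecutive steps of 10 trees
n_grid = list(range(10, 200, 10))
for j in range(1, len(rmse)):
    if abs(rmse[j] - rmse[j - 1]) / rmse[j - 1] < 0.01:
        print("RMSE roughly stabilizes around", n_grid[j], "trees")
        break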
8.8 Bagging KNN
Should we bag a tuned KNN model or an untuned one?
from sklearn.preprocessing import StandardScaler

model = KNeighborsRegressor(n_neighbors=9)   # optimal number of neighbors
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
-np.mean(cross_val_score(model, X_scaled, y, cv = cv,
                         scoring='neg_root_mean_squared_error', n_jobs = -1))
6972.997277781689
model = KNeighborsRegressor(n_neighbors=1)
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
-np.mean(cross_val_score(BaggingRegressor(model), X_scaled, y, cv = cv,
                         scoring='neg_root_mean_squared_error', n_jobs = -1))
6254.305462266355
model = BaggingRegressor(DecisionTreeRegressor(), n_estimators=5, warm_start=True)
model.fit(X, y)
rmse = []
for i in range(10, 200, 10):
    model.n_estimators = i
    model.fit(X, y)
    rmse.append(mean_squared_error(model.predict(Xtest), ytest, squared=False))
sns.lineplot(x = range(10, i + 1, 10), y = rmse)